In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore

Breast cancer¶

It is a disease in which cells in the breast grow out of control. There are different kinds of breast cancer. The kind of breast cancer depends on which cells in the breast turn into cancer.¶

Breast cancer can begin in different parts of the breast. A breast is made up of three main parts: lobules, ducts, and connective tissue. The lobules are the glands that produce milk. The ducts are tubes that carry milk to the nipple. The connective tissue (which consists of fibrous and fatty tissue) surrounds and holds everything together. Most breast cancers begin in the ducts or lobules.¶

Breast cancer can spread outside the breast through blood vessels and lymph vessels. When breast cancer spreads to other parts of the body, it is said to have metastasized.¶

In [2]:
df=pd.read_csv('BC.csv')
df
Out[2]:
mean_radius mean_texture mean_perimeter mean_area mean_smoothness diagnosis
0 17.99 10.38 122.80 1001.0 0.11840 0
1 20.57 17.77 132.90 1326.0 0.08474 0
2 19.69 21.25 130.00 1203.0 0.10960 0
3 11.42 20.38 77.58 386.1 0.14250 0
4 20.29 14.34 135.10 1297.0 0.10030 0
... ... ... ... ... ... ...
564 21.56 22.39 142.00 1479.0 0.11100 0
565 20.13 28.25 131.20 1261.0 0.09780 0
566 16.60 28.08 108.30 858.1 0.08455 0
567 20.60 29.33 140.10 1265.0 0.11780 0
568 7.76 24.54 47.92 181.0 0.05263 1

569 rows × 6 columns

In [42]:
df.head()
Out[42]:
mean_radius mean_texture mean_perimeter mean_area mean_smoothness diagnosis
0 17.99 10.38 122.80 1001.0 0.11840 0
1 20.57 17.77 132.90 1326.0 0.08474 0
2 19.69 21.25 130.00 1203.0 0.10960 0
3 11.42 20.38 77.58 386.1 0.14250 0
4 20.29 14.34 135.10 1297.0 0.10030 0
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_radius      569 non-null    float64
 1   mean_texture     569 non-null    float64
 2   mean_perimeter   569 non-null    float64
 3   mean_area        569 non-null    float64
 4   mean_smoothness  569 non-null    float64
 5   diagnosis        569 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 26.8 KB
In [4]:
df.describe()
Out[4]:
mean_radius mean_texture mean_perimeter mean_area mean_smoothness diagnosis
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.627417
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.483918
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.000000
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.000000
50% 13.370000 18.840000 86.240000 551.100000 0.095870 1.000000
75% 15.780000 21.800000 104.100000 782.700000 0.105300 1.000000
max 28.110000 39.280000 188.500000 2501.000000 0.163400 1.000000
In [5]:
df.isnull().sum()
Out[5]:
mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64
In [6]:
df.tail()
Out[6]:
mean_radius mean_texture mean_perimeter mean_area mean_smoothness diagnosis
564 21.56 22.39 142.00 1479.0 0.11100 0
565 20.13 28.25 131.20 1261.0 0.09780 0
566 16.60 28.08 108.30 858.1 0.08455 0
567 20.60 29.33 140.10 1265.0 0.11780 0
568 7.76 24.54 47.92 181.0 0.05263 1
In [37]:
sns.pairplot(df,hue='diagnosis')
Out[37]:
<seaborn.axisgrid.PairGrid at 0x135689fe850>
In [38]:
df['diagnosis'].value_counts().plot.pie(autopct="%.2f%%")
Out[38]:
<AxesSubplot:ylabel='diagnosis'>
In [39]:
plt.figure(figsize=(14,6))
corr=df.corr(method='pearson')
heatmap=sns.heatmap(corr,annot=True,vmax=1,vmin=-1,linewidths=1,linecolor='White')
plt.show()
In [9]:
for i in df.columns:
    sns.distplot(df[i])
    plt.show()
In [10]:
for i in df.columns:
    sns.boxplot(df[i])
    plt.show()
In [41]:
import plotly.express as px
fig = px.scatter_3d(df, x='mean_texture', y='mean_radius', z='diagnosis',
              color='mean_area')
fig.show()

MACHINE LEARNING MODEL¶

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report

KNN¶

In [14]:
X=df.iloc[:,:-1]
y=df['diagnosis']
In [15]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=123)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
(455, 5) (455,)
(114, 5) (114,)
In [17]:
se=StandardScaler()
X_train=se.fit_transform(X_train)
X_test=se.fit_transform(X_test)
In [18]:
kn=KNeighborsClassifier()
kn.fit(X_train,y_train)
Out[18]:
KNeighborsClassifier()
In [19]:
y_train_pred=kn.predict(X_train)
y_test_pred=kn.predict(X_test)
In [20]:
print(accuracy_score(y_train,y_train_pred))
print(accuracy_score(y_test,y_test_pred))
0.9318681318681319
0.9122807017543859

LOGISTIC REGRESSION¶

In [21]:
from sklearn.linear_model import LogisticRegression
In [22]:
lr=LogisticRegression()
lr.fit(X_train,y_train)
Out[22]:
LogisticRegression()
In [23]:
y_train_pred=lr.predict(X_train)
y_test_pred=lr.predict(X_test)
In [24]:
print(accuracy_score(y_train,y_train_pred))
print(accuracy_score(y_test,y_test_pred))
0.9296703296703297
0.9385964912280702
In [25]:
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
              precision    recall  f1-score   support

           0       0.93      0.88      0.90       171
           1       0.93      0.96      0.94       284

    accuracy                           0.93       455
   macro avg       0.93      0.92      0.92       455
weighted avg       0.93      0.93      0.93       455

              precision    recall  f1-score   support

           0       0.90      0.93      0.92        41
           1       0.96      0.95      0.95        73

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114

DECISION TREE¶

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
In [27]:
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
Out[27]:
DecisionTreeClassifier()
In [29]:
y_train_pred=dt.predict(X_train)
y_test_pred=dt.predict(X_test)
In [30]:
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       171
           1       1.00      1.00      1.00       284

    accuracy                           1.00       455
   macro avg       1.00      1.00      1.00       455
weighted avg       1.00      1.00      1.00       455

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        41
           1       0.94      0.90      0.92        73

    accuracy                           0.90       114
   macro avg       0.89      0.90      0.90       114
weighted avg       0.91      0.90      0.90       114

In [33]:
dt1=DecisionTreeClassifier(max_depth=6)
dt1.fit(X_train,y_train)
Out[33]:
DecisionTreeClassifier(max_depth=6)
In [34]:
y_train_pred=dt1.predict(X_train)
y_test_pred=dt1.predict(X_test)
In [35]:
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       171
           1       1.00      0.99      0.99       284

    accuracy                           0.99       455
   macro avg       0.99      0.99      0.99       455
weighted avg       0.99      0.99      0.99       455

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        41
           1       0.94      0.90      0.92        73

    accuracy                           0.90       114
   macro avg       0.89      0.90      0.90       114
weighted avg       0.91      0.90      0.90       114

In [36]:
fig,ax=plt.subplots(figsize=(10,10))
chart=plot_tree(dt1,max_depth=6,feature_names=X.columns,filled=True,fontsize=10)
In [ ]: